Import Libraries¶
In [156]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
Dataset Overview¶
In [157]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
In [158]:
print(train.info())
print(train.describe())
print(train.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1460 non-null int64
1 MSSubClass 1460 non-null int64
2 MSZoning 1460 non-null object
3 LotFrontage 1201 non-null float64
4 LotArea 1460 non-null int64
5 Street 1460 non-null object
6 Alley 91 non-null object
7 LotShape 1460 non-null object
8 LandContour 1460 non-null object
9 Utilities 1460 non-null object
10 LotConfig 1460 non-null object
11 LandSlope 1460 non-null object
12 Neighborhood 1460 non-null object
13 Condition1 1460 non-null object
14 Condition2 1460 non-null object
15 BldgType 1460 non-null object
16 HouseStyle 1460 non-null object
17 OverallQual 1460 non-null int64
18 OverallCond 1460 non-null int64
19 YearBuilt 1460 non-null int64
20 YearRemodAdd 1460 non-null int64
21 RoofStyle 1460 non-null object
22 RoofMatl 1460 non-null object
23 Exterior1st 1460 non-null object
24 Exterior2nd 1460 non-null object
25 MasVnrType 1452 non-null object
26 MasVnrArea 1452 non-null float64
27 ExterQual 1460 non-null object
28 ExterCond 1460 non-null object
29 Foundation 1460 non-null object
30 BsmtQual 1423 non-null object
31 BsmtCond 1423 non-null object
32 BsmtExposure 1422 non-null object
33 BsmtFinType1 1423 non-null object
34 BsmtFinSF1 1460 non-null int64
35 BsmtFinType2 1422 non-null object
36 BsmtFinSF2 1460 non-null int64
37 BsmtUnfSF 1460 non-null int64
38 TotalBsmtSF 1460 non-null int64
39 Heating 1460 non-null object
40 HeatingQC 1460 non-null object
41 CentralAir 1460 non-null object
42 Electrical 1459 non-null object
43 1stFlrSF 1460 non-null int64
44 2ndFlrSF 1460 non-null int64
45 LowQualFinSF 1460 non-null int64
46 GrLivArea 1460 non-null int64
47 BsmtFullBath 1460 non-null int64
48 BsmtHalfBath 1460 non-null int64
49 FullBath 1460 non-null int64
50 HalfBath 1460 non-null int64
51 BedroomAbvGr 1460 non-null int64
52 KitchenAbvGr 1460 non-null int64
53 KitchenQual 1460 non-null object
54 TotRmsAbvGrd 1460 non-null int64
55 Functional 1460 non-null object
56 Fireplaces 1460 non-null int64
57 FireplaceQu 770 non-null object
58 GarageType 1379 non-null object
59 GarageYrBlt 1379 non-null float64
60 GarageFinish 1379 non-null object
61 GarageCars 1460 non-null int64
62 GarageArea 1460 non-null int64
63 GarageQual 1379 non-null object
64 GarageCond 1379 non-null object
65 PavedDrive 1460 non-null object
66 WoodDeckSF 1460 non-null int64
67 OpenPorchSF 1460 non-null int64
68 EnclosedPorch 1460 non-null int64
69 3SsnPorch 1460 non-null int64
70 ScreenPorch 1460 non-null int64
71 PoolArea 1460 non-null int64
72 PoolQC 7 non-null object
73 Fence 281 non-null object
74 MiscFeature 54 non-null object
75 MiscVal 1460 non-null int64
76 MoSold 1460 non-null int64
77 YrSold 1460 non-null int64
78 SaleType 1460 non-null object
79 SaleCondition 1460 non-null object
80 SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
None
Id MSSubClass LotFrontage LotArea OverallQual \
count 1460.000000 1460.000000 1201.000000 1460.000000 1460.000000
mean 730.500000 56.897260 70.049958 10516.828082 6.099315
std 421.610009 42.300571 24.284752 9981.264932 1.382997
min 1.000000 20.000000 21.000000 1300.000000 1.000000
25% 365.750000 20.000000 59.000000 7553.500000 5.000000
50% 730.500000 50.000000 69.000000 9478.500000 6.000000
75% 1095.250000 70.000000 80.000000 11601.500000 7.000000
max 1460.000000 190.000000 313.000000 215245.000000 10.000000
OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... \
count 1460.000000 1460.000000 1460.000000 1452.000000 1460.000000 ...
mean 5.575342 1971.267808 1984.865753 103.685262 443.639726 ...
std 1.112799 30.202904 20.645407 181.066207 456.098091 ...
min 1.000000 1872.000000 1950.000000 0.000000 0.000000 ...
25% 5.000000 1954.000000 1967.000000 0.000000 0.000000 ...
50% 5.000000 1973.000000 1994.000000 0.000000 383.500000 ...
75% 6.000000 2000.000000 2004.000000 166.000000 712.250000 ...
max 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 ...
WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch \
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 94.244521 46.660274 21.954110 3.409589 15.060959
std 125.338794 66.256028 61.119149 29.317331 55.757415
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 25.000000 0.000000 0.000000 0.000000
75% 168.000000 68.000000 0.000000 0.000000 0.000000
max 857.000000 547.000000 552.000000 508.000000 480.000000
PoolArea MiscVal MoSold YrSold SalePrice
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 2.758904 43.489041 6.321918 2007.815753 180921.195890
std 40.177307 496.123024 2.703626 1.328095 79442.502883
min 0.000000 0.000000 1.000000 2006.000000 34900.000000
25% 0.000000 0.000000 5.000000 2007.000000 129975.000000
50% 0.000000 0.000000 6.000000 2008.000000 163000.000000
75% 0.000000 0.000000 8.000000 2009.000000 214000.000000
max 738.000000 15500.000000 12.000000 2010.000000 755000.000000
[8 rows x 38 columns]
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
0 1 60 RL 65.0 8450 Pave NaN Reg
1 2 20 RL 80.0 9600 Pave NaN Reg
2 3 60 RL 68.0 11250 Pave NaN IR1
3 4 70 RL 60.0 9550 Pave NaN IR1
4 5 60 RL 84.0 14260 Pave NaN IR1
LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \
0 Lvl AllPub ... 0 NaN NaN NaN 0 2
1 Lvl AllPub ... 0 NaN NaN NaN 0 5
2 Lvl AllPub ... 0 NaN NaN NaN 0 9
3 Lvl AllPub ... 0 NaN NaN NaN 0 2
4 Lvl AllPub ... 0 NaN NaN NaN 0 12
YrSold SaleType SaleCondition SalePrice
0 2008 WD Normal 208500
1 2007 WD Normal 181500
2 2008 WD Normal 223500
3 2006 WD Abnorml 140000
4 2008 WD Normal 250000
[5 rows x 81 columns]
Missing Data¶
In [159]:
missing_data = train.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
missing_features = train.columns[train.isnull().any()].tolist()
PoolQC 1453 MiscFeature 1406 Alley 1369 Fence 1179 FireplaceQu 690 LotFrontage 259 GarageYrBlt 81 GarageCond 81 GarageType 81 GarageFinish 81 GarageQual 81 BsmtFinType2 38 BsmtExposure 38 BsmtQual 37 BsmtCond 37 BsmtFinType1 37 MasVnrArea 8 MasVnrType 8 Electrical 1 dtype: int64
Visualize Price Distribution¶
In [160]:
# Plot SalePrice distribution (original)
sns.set(style="whitegrid")
plt.figure(figsize=(10, 5))
sns.histplot(train['SalePrice'], kde=True, bins=40, color='skyblue')
plt.title('Sale Price Distribution (Original Scale)', fontsize=14)
plt.xlabel('Sale Price')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Plot SalePrice distribution (log-transformed)
plt.figure(figsize=(10, 5))
sns.histplot(np.log1p(train['SalePrice']), kde=True, bins=40, color='salmon')
plt.title('Sale Price Distribution (Log-Transformed)', fontsize=14)
plt.xlabel('Log(Sale Price)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
Visualize Features vs Target¶
In [161]:
all_features = [col for col in train.columns if col not in ['Id', 'SalePrice']]
for feature in all_features:
plt.figure(figsize=(10, 6))
if train[feature].dtype == 'object': # Categorical features
sns.boxplot(data=train, x=feature, y='SalePrice')
plt.title(f'Boxplot of SalePrice by {feature}')
plt.xticks(rotation=45)
else: # Numerical features
sns.scatterplot(data=train, x=feature, y='SalePrice', alpha=0.6)
plt.title(f'Scatterplot of SalePrice vs {feature}')
plt.xlabel(feature)
plt.ylabel('SalePrice')
plt.tight_layout()
save_path = os.path.join("..\\visualizations", f"{feature}_SalePrice_plot.png")
plt.savefig(save_path, bbox_inches='tight')
plt.show()
plt.close()
Visualize Feature Correlation Heatmap¶
In [162]:
# Only numeric columns
numeric_df = train.select_dtypes(include=[np.number])
# Compute correlation matrix
corr_matrix = numeric_df.corr()
# Sort features with highest absolute correlation to SalePrice
target_corr = corr_matrix['SalePrice'].drop('SalePrice').abs().sort_values(ascending=False)
# Select top 20 most correlated features with SalePrice
top_features = target_corr.head(20).index.tolist() + ['SalePrice']
# Plot heatmap
plt.figure(figsize=(14, 10))
sns.set(font_scale=1.1) # Slightly increase font size
sns.heatmap(train[top_features].corr(), annot=True, fmt=".2f", cmap="coolwarm",
cbar_kws={"label": "Correlation Coefficient"}, square=True)
plt.title("Top Feature Correlations with SalePrice", fontsize=16)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
Data Cleaning¶
In [163]:
# PoolQC
train['PoolQC'].fillna('NoPool', inplace=True)
pool_map = {'NoPool': 0, 'Fa': 1, 'Gd': 2, 'Ex': 3}
train['PoolQC'] = train['PoolQC'].map(pool_map)
In [164]:
# MisFeature
train['MiscFeature'].fillna('None', inplace=True)
misc_map = {'None': 0, 'Shed': 1, 'Gar2': 2, 'Othr': 3, 'TenC': 4}
train['MiscFeature'] = train['MiscFeature'].map(misc_map)
In [165]:
# Alley
train['Alley'].fillna('NoAlley', inplace=True)
alley_map = {'NoAlley': 0, 'Grvl': 1, 'Pave': 2}
train['Alley'] = train['Alley'].map(alley_map)
In [166]:
# Fence
train['Fence'].fillna('NoFence', inplace=True)
fence_map = {'NoFence': 0, 'MnWw': 1, 'MnPrv': 2, 'GdWo': 3, 'GdPrv': 4}
train['Fence'] = train['Fence'].map(fence_map)
In [167]:
# FireplaceQu
train['FireplaceQu'].fillna('NoFireplace', inplace=True)
fireplace_map = {'NoFireplace': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
train['FireplaceQu'] = train['FireplaceQu'].map(fireplace_map)
In [168]:
# LotFrontage
train['LotFrontage'] = train.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
In [169]:
# GarageYrBlt GarageFinish, GarageQual, GarageCond
train['GarageYrBlt'].fillna(0, inplace=True)
garage_categorical_features = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for col in garage_categorical_features:
train[col].fillna('NoGarage', inplace=True)
garage_type_map = {'NoGarage': 0, 'Attchd': 1, 'Detchd': 2, 'BuiltIn': 3, 'Basment': 4, 'CarPort': 5, '2Types': 6}
train['GarageType'] = train['GarageType'].map(garage_type_map)
garage_finish_map = {'NoGarage': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
train['GarageFinish'] = train['GarageFinish'].map(garage_finish_map)
garage_qual_cond_map = {'NoGarage': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
train['GarageQual'] = train['GarageQual'].map(garage_qual_cond_map)
train['GarageCond'] = train['GarageCond'].map(garage_qual_cond_map)
In [170]:
# BsmtFinType2, BsmtExposure, BsmtQual, BsmtCond, BsmtFinType1
bsmt_categorical_features = ['BsmtFinType1', 'BsmtFinType2', 'BsmtExposure', 'BsmtQual', 'BsmtCond']
for col in bsmt_categorical_features:
train[col].fillna('NoBasement', inplace=True)
bsmt_numerical_features = ['TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath', 'BsmtHalfBath']
for col in bsmt_numerical_features:
if col in train.columns:
train[col].fillna(0, inplace=True)
bsmt_fin_type_map = {'NoBasement': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
train['BsmtFinType1'] = train['BsmtFinType1'].map(bsmt_fin_type_map)
train['BsmtFinType2'] = train['BsmtFinType2'].map(bsmt_fin_type_map)
bsmt_exposure_map = {'NoBasement': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
train['BsmtExposure'] = train['BsmtExposure'].map(bsmt_exposure_map)
bsmt_qual_cond_map = {'NoBasement': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
train['BsmtQual'] = train['BsmtQual'].map(bsmt_qual_cond_map)
train['BsmtCond'] = train['BsmtCond'].map(bsmt_qual_cond_map)
In [171]:
# MasVnrType, MasVnrArea
train['MasVnrType'].fillna('None', inplace=True)
train['MasVnrArea'].fillna(0, inplace=True)
masvnr_type_map = {'None': 0, 'BrkCmn': 1, 'BrkFace': 2, 'Stone': 3}
train['MasVnrType'] = train['MasVnrType'].map(masvnr_type_map)
In [172]:
# Electrical
train['Electrical'].fillna('SBrkr', inplace=True)
electrical_map = {'SBrkr': 0, 'FuseA': 1, 'FuseF': 2, 'FuseP': 3, 'Mix': 4}
train['Electrical'] = train['Electrical'].map(electrical_map)
In [173]:
missing_data = train.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
Series([], dtype: int64)
In [174]:
categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
non_categorical_columns = train.select_dtypes(exclude=['object']).columns.tolist()
print("Categorical Columns:")
print(categorical_columns)
print("\nNon-Categorical Columns:")
print(non_categorical_columns)
Categorical Columns: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition'] Non-Categorical Columns: ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'Alley', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
In [175]:
ordinal_features = ['LotShape', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond',
'HeatingQC', 'KitchenQual', 'Functional', 'PavedDrive', 'CentralAir']
ordinal_mappings = {
'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
'Utilities': {'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1, 'ELO': 0},
'LandSlope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7},
'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
'CentralAir': {'N': 0, 'Y': 1}
}
for feature, mapping in ordinal_mappings.items():
train[feature] = train[feature].map(mapping)
one_hot_features = [col for col in categorical_columns if col not in ordinal_features]
train = pd.get_dummies(train, columns=one_hot_features, drop_first=True)
In [176]:
missing_data = train.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
Series([], dtype: int64)
In [177]:
print(train.info())
print(train.describe())
print(train.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 190 entries, Id to SaleCondition_Partial
dtypes: float64(3), int64(61), uint8(126)
memory usage: 909.8 KB
None
Id MSSubClass LotFrontage LotArea Alley \
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 730.500000 56.897260 70.199658 10516.828082 0.090411
std 421.610009 42.300571 22.431902 9981.264932 0.372151
min 1.000000 20.000000 21.000000 1300.000000 0.000000
25% 365.750000 20.000000 60.000000 7553.500000 0.000000
50% 730.500000 50.000000 70.000000 9478.500000 0.000000
75% 1095.250000 70.000000 80.000000 11601.500000 0.000000
max 1460.000000 190.000000 313.000000 215245.000000 2.000000
LotShape Utilities LandSlope OverallQual OverallCond ... \
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 ...
mean 2.591781 2.998630 1.937671 6.099315 5.575342 ...
std 0.582296 0.052342 0.276232 1.382997 1.112799 ...
min 0.000000 1.000000 0.000000 1.000000 1.000000 ...
25% 2.000000 3.000000 2.000000 5.000000 5.000000 ...
50% 3.000000 3.000000 2.000000 6.000000 5.000000 ...
75% 3.000000 3.000000 2.000000 7.000000 6.000000 ...
max 3.000000 3.000000 2.000000 10.000000 9.000000 ...
SaleType_ConLI SaleType_ConLw SaleType_New SaleType_Oth \
count 1460.000000 1460.000000 1460.000000 1460.000000
mean 0.003425 0.003425 0.083562 0.002055
std 0.058440 0.058440 0.276824 0.045299
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
SaleType_WD SaleCondition_AdjLand SaleCondition_Alloca \
count 1460.000000 1460.000000 1460.000000
mean 0.867808 0.002740 0.008219
std 0.338815 0.052289 0.090317
min 0.000000 0.000000 0.000000
25% 1.000000 0.000000 0.000000
50% 1.000000 0.000000 0.000000
75% 1.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000
SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
count 1460.000000 1460.000000 1460.000000
mean 0.013699 0.820548 0.085616
std 0.116277 0.383862 0.279893
min 0.000000 0.000000 0.000000
25% 0.000000 1.000000 0.000000
50% 0.000000 1.000000 0.000000
75% 0.000000 1.000000 0.000000
max 1.000000 1.000000 1.000000
[8 rows x 190 columns]
Id MSSubClass LotFrontage LotArea Alley LotShape Utilities \
0 1 60 65.0 8450 0 3 3
1 2 20 80.0 9600 0 3 3
2 3 60 68.0 11250 0 2 3
3 4 70 60.0 9550 0 2 3
4 5 60 84.0 14260 0 2 3
LandSlope OverallQual OverallCond ... SaleType_ConLI SaleType_ConLw \
0 2 7 5 ... 0 0
1 2 6 8 ... 0 0
2 2 7 5 ... 0 0
3 2 7 5 ... 0 0
4 2 8 5 ... 0 0
SaleType_New SaleType_Oth SaleType_WD SaleCondition_AdjLand \
0 0 0 1 0
1 0 0 1 0
2 0 0 1 0
3 0 0 1 0
4 0 0 1 0
SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \
0 0 0 1
1 0 0 1
2 0 0 1
3 0 0 0
4 0 0 1
SaleCondition_Partial
0 0
1 0
2 0
3 0
4 0
[5 rows x 190 columns]
Save Clean Train Data¶
In [178]:
train.to_csv("../data/clean_train.csv", index=False)
print("Cleaned train dataset saved as clean_train.csv.")
Cleaned train dataset saved as clean_train.csv.
Test Data Overview¶
In [179]:
print(test.info())
print(test.describe())
print(test.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1459 non-null int64
1 MSSubClass 1459 non-null int64
2 MSZoning 1455 non-null object
3 LotFrontage 1232 non-null float64
4 LotArea 1459 non-null int64
5 Street 1459 non-null object
6 Alley 107 non-null object
7 LotShape 1459 non-null object
8 LandContour 1459 non-null object
9 Utilities 1457 non-null object
10 LotConfig 1459 non-null object
11 LandSlope 1459 non-null object
12 Neighborhood 1459 non-null object
13 Condition1 1459 non-null object
14 Condition2 1459 non-null object
15 BldgType 1459 non-null object
16 HouseStyle 1459 non-null object
17 OverallQual 1459 non-null int64
18 OverallCond 1459 non-null int64
19 YearBuilt 1459 non-null int64
20 YearRemodAdd 1459 non-null int64
21 RoofStyle 1459 non-null object
22 RoofMatl 1459 non-null object
23 Exterior1st 1458 non-null object
24 Exterior2nd 1458 non-null object
25 MasVnrType 1443 non-null object
26 MasVnrArea 1444 non-null float64
27 ExterQual 1459 non-null object
28 ExterCond 1459 non-null object
29 Foundation 1459 non-null object
30 BsmtQual 1415 non-null object
31 BsmtCond 1414 non-null object
32 BsmtExposure 1415 non-null object
33 BsmtFinType1 1417 non-null object
34 BsmtFinSF1 1458 non-null float64
35 BsmtFinType2 1417 non-null object
36 BsmtFinSF2 1458 non-null float64
37 BsmtUnfSF 1458 non-null float64
38 TotalBsmtSF 1458 non-null float64
39 Heating 1459 non-null object
40 HeatingQC 1459 non-null object
41 CentralAir 1459 non-null object
42 Electrical 1459 non-null object
43 1stFlrSF 1459 non-null int64
44 2ndFlrSF 1459 non-null int64
45 LowQualFinSF 1459 non-null int64
46 GrLivArea 1459 non-null int64
47 BsmtFullBath 1457 non-null float64
48 BsmtHalfBath 1457 non-null float64
49 FullBath 1459 non-null int64
50 HalfBath 1459 non-null int64
51 BedroomAbvGr 1459 non-null int64
52 KitchenAbvGr 1459 non-null int64
53 KitchenQual 1458 non-null object
54 TotRmsAbvGrd 1459 non-null int64
55 Functional 1457 non-null object
56 Fireplaces 1459 non-null int64
57 FireplaceQu 729 non-null object
58 GarageType 1383 non-null object
59 GarageYrBlt 1381 non-null float64
60 GarageFinish 1381 non-null object
61 GarageCars 1458 non-null float64
62 GarageArea 1458 non-null float64
63 GarageQual 1381 non-null object
64 GarageCond 1381 non-null object
65 PavedDrive 1459 non-null object
66 WoodDeckSF 1459 non-null int64
67 OpenPorchSF 1459 non-null int64
68 EnclosedPorch 1459 non-null int64
69 3SsnPorch 1459 non-null int64
70 ScreenPorch 1459 non-null int64
71 PoolArea 1459 non-null int64
72 PoolQC 3 non-null object
73 Fence 290 non-null object
74 MiscFeature 51 non-null object
75 MiscVal 1459 non-null int64
76 MoSold 1459 non-null int64
77 YrSold 1459 non-null int64
78 SaleType 1458 non-null object
79 SaleCondition 1459 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 912.0+ KB
None
Id MSSubClass LotFrontage LotArea OverallQual \
count 1459.000000 1459.000000 1232.000000 1459.000000 1459.000000
mean 2190.000000 57.378341 68.580357 9819.161069 6.078821
std 421.321334 42.746880 22.376841 4955.517327 1.436812
min 1461.000000 20.000000 21.000000 1470.000000 1.000000
25% 1825.500000 20.000000 58.000000 7391.000000 5.000000
50% 2190.000000 50.000000 67.000000 9399.000000 6.000000
75% 2554.500000 70.000000 80.000000 11517.500000 7.000000
max 2919.000000 190.000000 200.000000 56600.000000 10.000000
OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... \
count 1459.000000 1459.000000 1459.000000 1444.000000 1458.000000 ...
mean 5.553804 1971.357779 1983.662783 100.709141 439.203704 ...
std 1.113740 30.390071 21.130467 177.625900 455.268042 ...
min 1.000000 1879.000000 1950.000000 0.000000 0.000000 ...
25% 5.000000 1953.000000 1963.000000 0.000000 0.000000 ...
50% 5.000000 1973.000000 1992.000000 0.000000 350.500000 ...
75% 6.000000 2001.000000 2004.000000 164.000000 753.500000 ...
max 9.000000 2010.000000 2010.000000 1290.000000 4010.000000 ...
GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \
count 1458.000000 1459.000000 1459.000000 1459.000000 1459.000000
mean 472.768861 93.174777 48.313914 24.243317 1.794380
std 217.048611 127.744882 68.883364 67.227765 20.207842
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 318.000000 0.000000 0.000000 0.000000 0.000000
50% 480.000000 0.000000 28.000000 0.000000 0.000000
75% 576.000000 168.000000 72.000000 0.000000 0.000000
max 1488.000000 1424.000000 742.000000 1012.000000 360.000000
ScreenPorch PoolArea MiscVal MoSold YrSold
count 1459.000000 1459.000000 1459.000000 1459.000000 1459.000000
mean 17.064428 1.744345 58.167923 6.104181 2007.769705
std 56.609763 30.491646 630.806978 2.722432 1.301740
min 0.000000 0.000000 0.000000 1.000000 2006.000000
25% 0.000000 0.000000 0.000000 4.000000 2007.000000
50% 0.000000 0.000000 0.000000 6.000000 2008.000000
75% 0.000000 0.000000 0.000000 8.000000 2009.000000
max 576.000000 800.000000 17000.000000 12.000000 2010.000000
[8 rows x 37 columns]
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
0 1461 20 RH 80.0 11622 Pave NaN Reg
1 1462 20 RL 81.0 14267 Pave NaN IR1
2 1463 60 RL 74.0 13830 Pave NaN IR1
3 1464 60 RL 78.0 9978 Pave NaN IR1
4 1465 120 RL 43.0 5005 Pave NaN IR1
LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature \
0 Lvl AllPub ... 120 0 NaN MnPrv NaN
1 Lvl AllPub ... 0 0 NaN NaN Gar2
2 Lvl AllPub ... 0 0 NaN MnPrv NaN
3 Lvl AllPub ... 0 0 NaN NaN NaN
4 HLS AllPub ... 144 0 NaN NaN NaN
MiscVal MoSold YrSold SaleType SaleCondition
0 0 6 2010 WD Normal
1 12500 6 2010 WD Normal
2 0 3 2010 WD Normal
3 0 6 2010 WD Normal
4 0 1 2010 WD Normal
[5 rows x 80 columns]
In [180]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
PoolQC 1456 MiscFeature 1408 Alley 1352 Fence 1169 FireplaceQu 730 LotFrontage 227 GarageYrBlt 78 GarageQual 78 GarageFinish 78 GarageCond 78 GarageType 76 BsmtCond 45 BsmtQual 44 BsmtExposure 44 BsmtFinType1 42 BsmtFinType2 42 MasVnrType 16 MasVnrArea 15 MSZoning 4 BsmtHalfBath 2 Utilities 2 Functional 2 BsmtFullBath 2 BsmtFinSF1 1 BsmtFinSF2 1 BsmtUnfSF 1 KitchenQual 1 TotalBsmtSF 1 Exterior2nd 1 GarageCars 1 Exterior1st 1 GarageArea 1 SaleType 1 dtype: int64
Data Cleaning¶
In [181]:
# PoolQC
test['PoolQC'].fillna('NoPool', inplace=True)
pool_map = {'NoPool': 0, 'Fa': 1, 'Gd': 2, 'Ex': 3}
test['PoolQC'] = test['PoolQC'].map(pool_map)
In [182]:
# MisFeature
test['MiscFeature'].fillna('None', inplace=True)
misc_map = {'None': 0, 'Shed': 1, 'Gar2': 2, 'Othr': 3, 'TenC': 4}
test['MiscFeature'] = test['MiscFeature'].map(misc_map)
In [183]:
# Alley
test['Alley'].fillna('NoAlley', inplace=True)
alley_map = {'NoAlley': 0, 'Grvl': 1, 'Pave': 2}
test['Alley'] = test['Alley'].map(alley_map)
In [184]:
# Fence
test['Fence'].fillna('NoFence', inplace=True)
fence_map = {'NoFence': 0, 'MnWw': 1, 'MnPrv': 2, 'GdWo': 3, 'GdPrv': 4}
test['Fence'] = test['Fence'].map(fence_map)
In [185]:
# FireplaceQu
test['FireplaceQu'].fillna('NoFireplace', inplace=True)
fireplace_map = {'NoFireplace': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
test['FireplaceQu'] = test['FireplaceQu'].map(fireplace_map)
In [186]:
# LotFrontage
test['LotFrontage'] = test.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
In [187]:
# GarageYrBlt GarageFinish, GarageQual, GarageCond
test['GarageYrBlt'].fillna(0, inplace=True)
garage_categorical_features = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for col in garage_categorical_features:
test[col].fillna('NoGarage', inplace=True)
garage_type_map = {'NoGarage': 0, 'Attchd': 1, 'Detchd': 2, 'BuiltIn': 3, 'Basment': 4, 'CarPort': 5, '2Types': 6}
test['GarageType'] = test['GarageType'].map(garage_type_map)
garage_finish_map = {'NoGarage': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
test['GarageFinish'] = test['GarageFinish'].map(garage_finish_map)
garage_qual_cond_map = {'NoGarage': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
test['GarageQual'] = test['GarageQual'].map(garage_qual_cond_map)
test['GarageCond'] = test['GarageCond'].map(garage_qual_cond_map)
In [188]:
# BsmtFinType2, BsmtExposure, BsmtQual, BsmtCond, BsmtFinType1
bsmt_categorical_features = ['BsmtFinType1', 'BsmtFinType2', 'BsmtExposure', 'BsmtQual', 'BsmtCond']
for col in bsmt_categorical_features:
test[col].fillna('NoBasement', inplace=True)
bsmt_numerical_features = ['TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath', 'BsmtHalfBath']
for col in bsmt_numerical_features:
if col in test.columns:
test[col].fillna(0, inplace=True)
bsmt_fin_type_map = {'NoBasement': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
test['BsmtFinType1'] = test['BsmtFinType1'].map(bsmt_fin_type_map)
test['BsmtFinType2'] = test['BsmtFinType2'].map(bsmt_fin_type_map)
bsmt_exposure_map = {'NoBasement': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
test['BsmtExposure'] = test['BsmtExposure'].map(bsmt_exposure_map)
bsmt_qual_cond_map = {'NoBasement': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
test['BsmtQual'] = test['BsmtQual'].map(bsmt_qual_cond_map)
test['BsmtCond'] = test['BsmtCond'].map(bsmt_qual_cond_map)
In [189]:
# MasVnrType, MasVnrArea
test['MasVnrType'].fillna('None', inplace=True)
test['MasVnrArea'].fillna(0, inplace=True)
masvnr_type_map = {'None': 0, 'BrkCmn': 1, 'BrkFace': 2, 'Stone': 3}
test['MasVnrType'] = test['MasVnrType'].map(masvnr_type_map)
In [190]:
# Electrical
test['Electrical'].fillna('SBrkr', inplace=True)
electrical_map = {'SBrkr': 0, 'FuseA': 1, 'FuseF': 2, 'FuseP': 3, 'Mix': 4}
test['Electrical'] = test['Electrical'].map(electrical_map)
In [191]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
MSZoning 4 Utilities 2 Functional 2 GarageCars 1 GarageArea 1 KitchenQual 1 Exterior1st 1 Exterior2nd 1 SaleType 1 dtype: int64
In [192]:
train1 = pd.read_csv('../data/train.csv')
test['Utilities'].fillna(train1['Utilities'].mode()[0], inplace=True)
test['Functional'].fillna(train1['Functional'].mode()[0], inplace=True)
test['GarageCars'].fillna(0, inplace=True)
test['GarageArea'].fillna(0, inplace=True)
test['KitchenQual'].fillna(train1['KitchenQual'].mode()[0], inplace=True)
test['MSZoning'].fillna(train1['MSZoning'].mode()[0], inplace=True)
test['Exterior1st'].fillna(train1['Exterior1st'].mode()[0], inplace=True)
test['Exterior2nd'].fillna(train1['Exterior2nd'].mode()[0], inplace=True)
test['SaleType'].fillna(train1['SaleType'].mode()[0], inplace=True)
In [193]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
print(test.info())
Series([], dtype: int64) <class 'pandas.core.frame.DataFrame'> RangeIndex: 1459 entries, 0 to 1458 Data columns (total 80 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1459 non-null int64 1 MSSubClass 1459 non-null int64 2 MSZoning 1459 non-null object 3 LotFrontage 1459 non-null float64 4 LotArea 1459 non-null int64 5 Street 1459 non-null object 6 Alley 1459 non-null int64 7 LotShape 1459 non-null object 8 LandContour 1459 non-null object 9 Utilities 1459 non-null object 10 LotConfig 1459 non-null object 11 LandSlope 1459 non-null object 12 Neighborhood 1459 non-null object 13 Condition1 1459 non-null object 14 Condition2 1459 non-null object 15 BldgType 1459 non-null object 16 HouseStyle 1459 non-null object 17 OverallQual 1459 non-null int64 18 OverallCond 1459 non-null int64 19 YearBuilt 1459 non-null int64 20 YearRemodAdd 1459 non-null int64 21 RoofStyle 1459 non-null object 22 RoofMatl 1459 non-null object 23 Exterior1st 1459 non-null object 24 Exterior2nd 1459 non-null object 25 MasVnrType 1459 non-null int64 26 MasVnrArea 1459 non-null float64 27 ExterQual 1459 non-null object 28 ExterCond 1459 non-null object 29 Foundation 1459 non-null object 30 BsmtQual 1459 non-null int64 31 BsmtCond 1459 non-null int64 32 BsmtExposure 1459 non-null int64 33 BsmtFinType1 1459 non-null int64 34 BsmtFinSF1 1459 non-null float64 35 BsmtFinType2 1459 non-null int64 36 BsmtFinSF2 1459 non-null float64 37 BsmtUnfSF 1459 non-null float64 38 TotalBsmtSF 1459 non-null float64 39 Heating 1459 non-null object 40 HeatingQC 1459 non-null object 41 CentralAir 1459 non-null object 42 Electrical 1459 non-null int64 43 1stFlrSF 1459 non-null int64 44 2ndFlrSF 1459 non-null int64 45 LowQualFinSF 1459 non-null int64 46 GrLivArea 1459 non-null int64 47 BsmtFullBath 1459 non-null float64 48 BsmtHalfBath 1459 non-null float64 49 FullBath 1459 non-null int64 50 HalfBath 1459 non-null int64 51 BedroomAbvGr 1459 non-null int64 52 KitchenAbvGr 1459 non-null int64 53 KitchenQual 1459 non-null object 54 TotRmsAbvGrd 1459 non-null int64 55 Functional 1459 non-null object 56 Fireplaces 1459 non-null int64 57 FireplaceQu 1459 non-null int64 58 GarageType 1459 non-null int64 59 GarageYrBlt 1459 non-null float64 60 GarageFinish 1459 non-null int64 61 GarageCars 1459 non-null float64 62 GarageArea 1459 non-null float64 63 GarageQual 1459 non-null int64 64 GarageCond 1459 non-null int64 65 PavedDrive 1459 non-null object 66 WoodDeckSF 1459 non-null int64 67 OpenPorchSF 1459 non-null int64 68 EnclosedPorch 1459 non-null int64 69 3SsnPorch 1459 non-null int64 70 ScreenPorch 1459 non-null int64 71 PoolArea 1459 non-null int64 72 PoolQC 1459 non-null int64 73 Fence 1459 non-null int64 74 MiscFeature 1459 non-null int64 75 MiscVal 1459 non-null int64 76 MoSold 1459 non-null int64 77 YrSold 1459 non-null int64 78 SaleType 1459 non-null object 79 SaleCondition 1459 non-null object dtypes: float64(11), int64(42), object(27) memory usage: 912.0+ KB None
In [194]:
categorical_columns = test.select_dtypes(include=['object']).columns.tolist()
non_categorical_columns = test.select_dtypes(exclude=['object']).columns.tolist()
print("Categorical Columns:")
print(categorical_columns)
print("\nNon-Categorical Columns:")
print(non_categorical_columns)
Categorical Columns: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition'] Non-Categorical Columns: ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'Alley', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold']
In [195]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
print(test.info())
missing_cols = set(train.columns) - set(test.columns)
print(missing_cols)
Series([], dtype: int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1459 non-null int64
1 MSSubClass 1459 non-null int64
2 MSZoning 1459 non-null object
3 LotFrontage 1459 non-null float64
4 LotArea 1459 non-null int64
5 Street 1459 non-null object
6 Alley 1459 non-null int64
7 LotShape 1459 non-null object
8 LandContour 1459 non-null object
9 Utilities 1459 non-null object
10 LotConfig 1459 non-null object
11 LandSlope 1459 non-null object
12 Neighborhood 1459 non-null object
13 Condition1 1459 non-null object
14 Condition2 1459 non-null object
15 BldgType 1459 non-null object
16 HouseStyle 1459 non-null object
17 OverallQual 1459 non-null int64
18 OverallCond 1459 non-null int64
19 YearBuilt 1459 non-null int64
20 YearRemodAdd 1459 non-null int64
21 RoofStyle 1459 non-null object
22 RoofMatl 1459 non-null object
23 Exterior1st 1459 non-null object
24 Exterior2nd 1459 non-null object
25 MasVnrType 1459 non-null int64
26 MasVnrArea 1459 non-null float64
27 ExterQual 1459 non-null object
28 ExterCond 1459 non-null object
29 Foundation 1459 non-null object
30 BsmtQual 1459 non-null int64
31 BsmtCond 1459 non-null int64
32 BsmtExposure 1459 non-null int64
33 BsmtFinType1 1459 non-null int64
34 BsmtFinSF1 1459 non-null float64
35 BsmtFinType2 1459 non-null int64
36 BsmtFinSF2 1459 non-null float64
37 BsmtUnfSF 1459 non-null float64
38 TotalBsmtSF 1459 non-null float64
39 Heating 1459 non-null object
40 HeatingQC 1459 non-null object
41 CentralAir 1459 non-null object
42 Electrical 1459 non-null int64
43 1stFlrSF 1459 non-null int64
44 2ndFlrSF 1459 non-null int64
45 LowQualFinSF 1459 non-null int64
46 GrLivArea 1459 non-null int64
47 BsmtFullBath 1459 non-null float64
48 BsmtHalfBath 1459 non-null float64
49 FullBath 1459 non-null int64
50 HalfBath 1459 non-null int64
51 BedroomAbvGr 1459 non-null int64
52 KitchenAbvGr 1459 non-null int64
53 KitchenQual 1459 non-null object
54 TotRmsAbvGrd 1459 non-null int64
55 Functional 1459 non-null object
56 Fireplaces 1459 non-null int64
57 FireplaceQu 1459 non-null int64
58 GarageType 1459 non-null int64
59 GarageYrBlt 1459 non-null float64
60 GarageFinish 1459 non-null int64
61 GarageCars 1459 non-null float64
62 GarageArea 1459 non-null float64
63 GarageQual 1459 non-null int64
64 GarageCond 1459 non-null int64
65 PavedDrive 1459 non-null object
66 WoodDeckSF 1459 non-null int64
67 OpenPorchSF 1459 non-null int64
68 EnclosedPorch 1459 non-null int64
69 3SsnPorch 1459 non-null int64
70 ScreenPorch 1459 non-null int64
71 PoolArea 1459 non-null int64
72 PoolQC 1459 non-null int64
73 Fence 1459 non-null int64
74 MiscFeature 1459 non-null int64
75 MiscVal 1459 non-null int64
76 MoSold 1459 non-null int64
77 YrSold 1459 non-null int64
78 SaleType 1459 non-null object
79 SaleCondition 1459 non-null object
dtypes: float64(11), int64(42), object(27)
memory usage: 912.0+ KB
None
{'Exterior1st_AsphShn', 'MSZoning_RL', 'Exterior2nd_ImStucc', 'Condition1_RRAe', 'Condition1_RRNe', 'Neighborhood_SawyerW', 'MSZoning_RM', 'Condition1_Norm', 'HouseStyle_2.5Unf', 'MSZoning_RH', 'SaleType_New', 'SaleCondition_Normal', 'Neighborhood_NWAmes', 'Exterior1st_BrkComm', 'Neighborhood_BrkSide', 'Condition1_Feedr', 'Neighborhood_NPkVill', 'RoofMatl_CompShg', 'SaleType_Con', 'Exterior2nd_MetalSd', 'Foundation_CBlock', 'Foundation_Stone', 'Condition2_RRAe', 'Neighborhood_IDOTRR', 'Exterior1st_Plywood', 'Neighborhood_NridgHt', 'Exterior2nd_CBlock', 'MSZoning_FV', 'SaleCondition_AdjLand', 'Neighborhood_NoRidge', 'HouseStyle_1Story', 'Condition2_PosN', 'Neighborhood_Somerst', 'SaleCondition_Alloca', 'Exterior2nd_Plywood', 'Exterior1st_WdShing', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_StoneBr', 'RoofMatl_Membran', 'Condition2_RRAn', 'Exterior1st_CBlock', 'Exterior1st_Wd Sdng', 'SaleType_Oth', 'Neighborhood_NAmes', 'RoofMatl_WdShngl', 'Exterior2nd_Stone', 'BldgType_2fmCon', 'Foundation_Wood', 'Foundation_PConc', 'LotConfig_CulDSac', 'LandContour_Low', 'Exterior2nd_HdBoard', 'Exterior2nd_VinylSd', 'Exterior2nd_Brk Cmn', 'Foundation_Slab', 'Exterior2nd_Wd Sdng', 'Street_Pave', 'SalePrice', 'HouseStyle_SLvl', 'RoofMatl_WdShake', 'SaleType_WD', 'HouseStyle_1.5Unf', 'Condition2_RRNn', 'Exterior1st_CemntBd', 'Neighborhood_Gilbert', 'Condition2_PosA', 'LotConfig_FR2', 'Condition2_Norm', 'BldgType_TwnhsE', 'HouseStyle_SFoyer', 'BldgType_Twnhs', 'SaleCondition_Partial', 'Exterior1st_MetalSd', 'SaleType_ConLw', 'Condition1_PosN', 'Neighborhood_CollgCr', 'Exterior2nd_Wd Shng', 'LotConfig_Inside', 'LandContour_Lvl', 'RoofStyle_Gambrel', 'RoofStyle_Mansard', 'Exterior1st_Stone', 'SaleType_ConLI', 'Neighborhood_MeadowV', 'Neighborhood_Timber', 'RoofStyle_Shed', 'Heating_Wall', 'Exterior1st_ImStucc', 'BldgType_Duplex', 'Neighborhood_Edwards', 'Exterior1st_HdBoard', 'Exterior2nd_CmentBd', 'Condition2_Feedr', 'Neighborhood_Veenker', 'RoofMatl_Metal', 'Neighborhood_Crawfor', 'Exterior1st_BrkFace', 'Exterior2nd_Stucco', 'RoofStyle_Gable', 'LotConfig_FR3', 'Heating_OthW', 'RoofMatl_Tar&Grv', 'Exterior1st_Stucco', 'Exterior1st_VinylSd', 'HouseStyle_2.5Fin', 'Neighborhood_BrDale', 'Condition1_RRNn', 'Condition1_PosA', 'Neighborhood_Mitchel', 'HouseStyle_2Story', 'Heating_GasA', 'SaleType_CWD', 'Heating_Grav', 'Exterior2nd_BrkFace', 'SaleCondition_Family', 'Exterior2nd_AsphShn', 'LandContour_HLS', 'Neighborhood_Blueste', 'Exterior2nd_Other', 'Neighborhood_Sawyer', 'Heating_GasW', 'RoofStyle_Hip', 'Neighborhood_ClearCr', 'SaleType_ConLD', 'RoofMatl_Roll', 'Condition1_RRAn'}
In [196]:
ordinal_mappings = {
'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
'Utilities': {'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1, 'ELO': 0},
'LandSlope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7},
'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
'CentralAir': {'N': 0, 'Y': 1}
}
ordinal_features = list(ordinal_mappings.keys())
for feature, mapping in ordinal_mappings.items():
if feature in test.columns:
test[feature] = test[feature].map(mapping)
In [197]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
print(test.info())
missing_cols = set(train.columns) - set(test.columns)
print(missing_cols)
Series([], dtype: int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1459 non-null int64
1 MSSubClass 1459 non-null int64
2 MSZoning 1459 non-null object
3 LotFrontage 1459 non-null float64
4 LotArea 1459 non-null int64
5 Street 1459 non-null object
6 Alley 1459 non-null int64
7 LotShape 1459 non-null int64
8 LandContour 1459 non-null object
9 Utilities 1459 non-null int64
10 LotConfig 1459 non-null object
11 LandSlope 1459 non-null int64
12 Neighborhood 1459 non-null object
13 Condition1 1459 non-null object
14 Condition2 1459 non-null object
15 BldgType 1459 non-null object
16 HouseStyle 1459 non-null object
17 OverallQual 1459 non-null int64
18 OverallCond 1459 non-null int64
19 YearBuilt 1459 non-null int64
20 YearRemodAdd 1459 non-null int64
21 RoofStyle 1459 non-null object
22 RoofMatl 1459 non-null object
23 Exterior1st 1459 non-null object
24 Exterior2nd 1459 non-null object
25 MasVnrType 1459 non-null int64
26 MasVnrArea 1459 non-null float64
27 ExterQual 1459 non-null int64
28 ExterCond 1459 non-null int64
29 Foundation 1459 non-null object
30 BsmtQual 1459 non-null int64
31 BsmtCond 1459 non-null int64
32 BsmtExposure 1459 non-null int64
33 BsmtFinType1 1459 non-null int64
34 BsmtFinSF1 1459 non-null float64
35 BsmtFinType2 1459 non-null int64
36 BsmtFinSF2 1459 non-null float64
37 BsmtUnfSF 1459 non-null float64
38 TotalBsmtSF 1459 non-null float64
39 Heating 1459 non-null object
40 HeatingQC 1459 non-null int64
41 CentralAir 1459 non-null int64
42 Electrical 1459 non-null int64
43 1stFlrSF 1459 non-null int64
44 2ndFlrSF 1459 non-null int64
45 LowQualFinSF 1459 non-null int64
46 GrLivArea 1459 non-null int64
47 BsmtFullBath 1459 non-null float64
48 BsmtHalfBath 1459 non-null float64
49 FullBath 1459 non-null int64
50 HalfBath 1459 non-null int64
51 BedroomAbvGr 1459 non-null int64
52 KitchenAbvGr 1459 non-null int64
53 KitchenQual 1459 non-null int64
54 TotRmsAbvGrd 1459 non-null int64
55 Functional 1459 non-null int64
56 Fireplaces 1459 non-null int64
57 FireplaceQu 1459 non-null int64
58 GarageType 1459 non-null int64
59 GarageYrBlt 1459 non-null float64
60 GarageFinish 1459 non-null int64
61 GarageCars 1459 non-null float64
62 GarageArea 1459 non-null float64
63 GarageQual 1459 non-null int64
64 GarageCond 1459 non-null int64
65 PavedDrive 1459 non-null int64
66 WoodDeckSF 1459 non-null int64
67 OpenPorchSF 1459 non-null int64
68 EnclosedPorch 1459 non-null int64
69 3SsnPorch 1459 non-null int64
70 ScreenPorch 1459 non-null int64
71 PoolArea 1459 non-null int64
72 PoolQC 1459 non-null int64
73 Fence 1459 non-null int64
74 MiscFeature 1459 non-null int64
75 MiscVal 1459 non-null int64
76 MoSold 1459 non-null int64
77 YrSold 1459 non-null int64
78 SaleType 1459 non-null object
79 SaleCondition 1459 non-null object
dtypes: float64(11), int64(52), object(17)
memory usage: 912.0+ KB
None
{'Exterior1st_AsphShn', 'MSZoning_RL', 'Exterior2nd_ImStucc', 'Condition1_RRAe', 'Condition1_RRNe', 'Neighborhood_SawyerW', 'MSZoning_RM', 'Condition1_Norm', 'HouseStyle_2.5Unf', 'MSZoning_RH', 'SaleType_New', 'SaleCondition_Normal', 'Neighborhood_NWAmes', 'Exterior1st_BrkComm', 'Neighborhood_BrkSide', 'Condition1_Feedr', 'Neighborhood_NPkVill', 'RoofMatl_CompShg', 'SaleType_Con', 'Exterior2nd_MetalSd', 'Foundation_CBlock', 'Foundation_Stone', 'Condition2_RRAe', 'Neighborhood_IDOTRR', 'Exterior1st_Plywood', 'Neighborhood_NridgHt', 'Exterior2nd_CBlock', 'MSZoning_FV', 'SaleCondition_AdjLand', 'Neighborhood_NoRidge', 'HouseStyle_1Story', 'Condition2_PosN', 'Neighborhood_Somerst', 'SaleCondition_Alloca', 'Exterior2nd_Plywood', 'Exterior1st_WdShing', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_StoneBr', 'RoofMatl_Membran', 'Condition2_RRAn', 'Exterior1st_CBlock', 'Exterior1st_Wd Sdng', 'SaleType_Oth', 'Neighborhood_NAmes', 'RoofMatl_WdShngl', 'Exterior2nd_Stone', 'BldgType_2fmCon', 'Foundation_Wood', 'Foundation_PConc', 'LotConfig_CulDSac', 'LandContour_Low', 'Exterior2nd_HdBoard', 'Exterior2nd_VinylSd', 'Exterior2nd_Brk Cmn', 'Foundation_Slab', 'Exterior2nd_Wd Sdng', 'Street_Pave', 'SalePrice', 'HouseStyle_SLvl', 'RoofMatl_WdShake', 'SaleType_WD', 'HouseStyle_1.5Unf', 'Condition2_RRNn', 'Exterior1st_CemntBd', 'Neighborhood_Gilbert', 'Condition2_PosA', 'LotConfig_FR2', 'Condition2_Norm', 'BldgType_TwnhsE', 'HouseStyle_SFoyer', 'BldgType_Twnhs', 'SaleCondition_Partial', 'Exterior1st_MetalSd', 'SaleType_ConLw', 'Condition1_PosN', 'Neighborhood_CollgCr', 'Exterior2nd_Wd Shng', 'LotConfig_Inside', 'LandContour_Lvl', 'RoofStyle_Gambrel', 'RoofStyle_Mansard', 'Exterior1st_Stone', 'SaleType_ConLI', 'Neighborhood_MeadowV', 'Neighborhood_Timber', 'RoofStyle_Shed', 'Heating_Wall', 'Exterior1st_ImStucc', 'BldgType_Duplex', 'Neighborhood_Edwards', 'Exterior1st_HdBoard', 'Exterior2nd_CmentBd', 'Condition2_Feedr', 'Neighborhood_Veenker', 'RoofMatl_Metal', 'Neighborhood_Crawfor', 'Exterior1st_BrkFace', 'Exterior2nd_Stucco', 'RoofStyle_Gable', 'LotConfig_FR3', 'Heating_OthW', 'RoofMatl_Tar&Grv', 'Exterior1st_Stucco', 'Exterior1st_VinylSd', 'HouseStyle_2.5Fin', 'Neighborhood_BrDale', 'Condition1_RRNn', 'Condition1_PosA', 'Neighborhood_Mitchel', 'HouseStyle_2Story', 'Heating_GasA', 'SaleType_CWD', 'Heating_Grav', 'Exterior2nd_BrkFace', 'SaleCondition_Family', 'Exterior2nd_AsphShn', 'LandContour_HLS', 'Neighborhood_Blueste', 'Exterior2nd_Other', 'Neighborhood_Sawyer', 'Heating_GasW', 'RoofStyle_Hip', 'Neighborhood_ClearCr', 'SaleType_ConLD', 'RoofMatl_Roll', 'Condition1_RRAn'}
In [198]:
ohe_columns = [
'MSZoning', 'Street', 'LandContour', 'LotConfig',
'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
'Exterior2nd', 'Foundation', 'Heating', 'SaleType', 'SaleCondition'
]
test = pd.get_dummies(test, columns=ohe_columns, drop_first=True)
In [199]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
print(test.info())
missing_cols = set(train.columns) - set(test.columns)
print(missing_cols)
Series([], dtype: int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 176 entries, Id to SaleCondition_Partial
dtypes: float64(11), int64(52), uint8(113)
memory usage: 879.2 KB
None
{'RoofMatl_Roll', 'RoofMatl_Membran', 'Condition2_RRAe', 'Condition2_RRAn', 'Heating_OthW', 'Exterior2nd_Other', 'SalePrice', 'HouseStyle_2.5Fin', 'Heating_GasA', 'RoofMatl_CompShg', 'Exterior1st_Stone', 'Condition2_RRNn', 'RoofMatl_Metal', 'Exterior1st_ImStucc'}
In [200]:
missing_cols = set(train.columns) - set(test.columns)
for col in missing_cols:
test[col] = 0
test = test[train.columns]
In [201]:
if 'SalePrice' in test.columns:
test = test.drop(columns=['SalePrice'])
In [202]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
Series([], dtype: int64)
In [203]:
missing_cols = set(train.columns) - set(test.columns)
print(missing_cols)
{'SalePrice'}
In [204]:
print(test.info())
print(test.describe())
print(test.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 189 entries, Id to SaleCondition_Partial
dtypes: float64(11), int64(65), uint8(113)
memory usage: 1.0 MB
None
Id MSSubClass LotFrontage LotArea Alley \
count 1459.000000 1459.000000 1459.000000 1459.000000 1459.000000
mean 2190.000000 57.378341 68.955106 9819.161069 0.098698
std 421.321334 42.746880 20.999091 4955.517327 0.373861
min 1461.000000 20.000000 21.000000 1470.000000 0.000000
25% 1825.500000 20.000000 60.000000 7391.000000 0.000000
50% 2190.000000 50.000000 70.000000 9399.000000 0.000000
75% 2554.500000 70.000000 80.000000 11517.500000 0.000000
max 2919.000000 190.000000 200.000000 56600.000000 2.000000
LotShape Utilities LandSlope OverallQual OverallCond ... \
count 1459.000000 1459.0 1459.000000 1459.000000 1459.000000 ...
mean 2.607951 3.0 1.954764 6.078821 5.553804 ...
std 0.557864 0.0 0.217566 1.436812 1.113740 ...
min 0.000000 3.0 0.000000 1.000000 1.000000 ...
25% 2.000000 3.0 2.000000 5.000000 5.000000 ...
50% 3.000000 3.0 2.000000 6.000000 5.000000 ...
75% 3.000000 3.0 2.000000 7.000000 6.000000 ...
max 3.000000 3.0 2.000000 10.000000 9.000000 ...
SaleType_ConLI SaleType_ConLw SaleType_New SaleType_Oth \
count 1459.000000 1459.000000 1459.000000 1459.000000
mean 0.002742 0.002056 0.080192 0.002742
std 0.052306 0.045314 0.271683 0.052306
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
SaleType_WD SaleCondition_AdjLand SaleCondition_Alloca \
count 1459.00000 1459.000000 1459.000000
mean 0.86292 0.005483 0.008225
std 0.34405 0.073871 0.090348
min 0.00000 0.000000 0.000000
25% 1.00000 0.000000 0.000000
50% 1.00000 0.000000 0.000000
75% 1.00000 0.000000 0.000000
max 1.00000 1.000000 1.000000
SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
count 1459.000000 1459.000000 1459.000000
mean 0.017820 0.825223 0.082248
std 0.132344 0.379907 0.274837
min 0.000000 0.000000 0.000000
25% 0.000000 1.000000 0.000000
50% 0.000000 1.000000 0.000000
75% 0.000000 1.000000 0.000000
max 1.000000 1.000000 1.000000
[8 rows x 189 columns]
Id MSSubClass LotFrontage LotArea Alley LotShape Utilities \
0 1461 20 80.0 11622 0 3 3
1 1462 20 81.0 14267 0 2 3
2 1463 60 74.0 13830 0 2 3
3 1464 60 78.0 9978 0 2 3
4 1465 120 43.0 5005 0 2 3
LandSlope OverallQual OverallCond ... SaleType_ConLI SaleType_ConLw \
0 2 5 6 ... 0 0
1 2 6 6 ... 0 0
2 2 5 5 ... 0 0
3 2 6 6 ... 0 0
4 2 8 5 ... 0 0
SaleType_New SaleType_Oth SaleType_WD SaleCondition_AdjLand \
0 0 0 1 0
1 0 0 1 0
2 0 0 1 0
3 0 0 1 0
4 0 0 1 0
SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal \
0 0 0 1
1 0 0 1
2 0 0 1
3 0 0 1
4 0 0 1
SaleCondition_Partial
0 0
1 0
2 0
3 0
4 0
[5 rows x 189 columns]
Save Clean Test Data¶
In [205]:
test.to_csv('../data/clean_test.csv', index=False)
print("Cleaned test dataset saved as clean_test.csv.")
Cleaned test dataset saved as clean_test.csv.